library(tidyverse)
library(gapminder)
Also see tidycensus and SQL
(See more in Urban Institute’s Do No Harm Guide)
as_tibble() transforms existing data into a tibble
data %>%
as_tibble()tibble() creates a tibble from new data
tibble(x = 1:5,
y = 1,
z = x ^ 2 + y)| x | y | z |
|---|---|---|
| 1 | 1 | 2 |
| 2 | 1 | 5 |
| 3 | 1 | 10 |
| 4 | 1 | 17 |
| 5 | 1 | 26 |
readr (part of tidyverse) covers basics like csv files
delim = ";" specifies the non-comma delimiter ("\t" is tab, " " is space)skip = n skips the first n linescomment = "#" drops all lines that start with (e.g.) #col_names = FALSE tells read_csv() not to treat the first row as headings and instead label them from X1 to Xncol_names = c("x", "y", "z") renames the columns as the character vectorcol_types = cols() reads in the columns as the specified data typena = 999 replaces dataset-specific NA values (e.g., 999) as NAread_csv("path/to/file.csv")
read_rds("path/to/file.rds")
read_delim("path/to/file.txt", delim = ";")#Best Practice
csv_data <- read_csv("path/to/file.csv",
col_types = cols(w = col_double(),
x = col_date(format = ""),
y = col_character(),
z = col_logical()))haven reads SPSS, Stata, and SAS files
library(haven)
read_sas("path/to/file.sas7bdat")
read_spss("path/to/file.spv")
read_stata("path/to/file.dta")readxl reads excel files (both .xls and .xlsx)
library(readxl)
read_excel("path/to/file.xlsx")DBI, along with a database specific backend (e.g. RMySQL, RSQLite, RPostgreSQL etc), runs SQL queries (See SQL)
library(DBI)
con <- "connection to database"
dbGetQuery(con, "
SELECT *
FROM table
") %>% as_tibble()jsonlite reads json files
xml2 reads XML
write_csv() exports a plain csv
dataset %>% write_csv("path/dataset.csv")write_rds() preserves formatting as an rds file
dataset %>% write_rds("path/dataset.rds")nrow() returns the number of rows
gapminder %>% nrow()## [1] 1704
ncol() returns the number of columns
gapminder %>% ncol()## [1] 6
names() returns a list of all of the columns
gapminder %>% names()## [1] "country" "continent" "year" "lifeExp" "pop" "gdpPercap"
distinct() returns a list of all unique values in a specified column
gapminder %>% distinct(continent)| continent |
|---|
| Asia |
| Europe |
| Africa |
| Americas |
| Oceania |
summary() gives basic stats on numeric columns, but I prefer this custom function
col_summaries <- function(.data){
all_cols <- .data %>% names()
test_col <- function(.test_col){
test_data <- .data %>%
as_tibble() %>%
rename(test = .test_col) %>%
mutate_if(is.logical, as.character)
if(dim(test_data %>% filter(!is.na(test)))[1] == 0){
test_data %>%
summarize(column = .test_col,
type = class(test_data$test),
missing = sum(is.na(test)),
unique_values = NA_real_,
example = NA_character_,
min = NA_real_,
q25 = NA_real_,
median = NA_real_,
q75 = NA_real_,
max = NA_real_)
} else if(class(test_data$test) %in% c("integer", "numeric")){
test_data %>%
summarize(column = .test_col,
type = class(test_data$test),
missing = sum(is.na(test)),
unique_values = NA_real_,
example = NA_character_,
min = min(test, na.rm = TRUE),
q25 = quantile(test, 0.25, na.rm = TRUE),
median = median(test, na.rm = TRUE),
q75 = quantile(test, 0.75, na.rm = TRUE),
max = max(test, na.rm = TRUE))
} else {
test_data %>%
summarize(column = .test_col,
type = class(test_data$test),
missing = sum(is.na(test)),
unique_values = n_distinct(test)) %>%
bind_cols(test_data %>%
filter(!is.na(test)) %>%
select(test) %>%
head(1) %>%
rename(example = test)) %>%
mutate(min = NA_real_,
q25 = NA_real_,
median = NA_real_,
q75 = NA_real_,
max = NA_real_ )
}
}
map_dfr(all_cols, test_col)
}gapminder %>% col_summaries()| column | type | missing | unique_values | example | min | q25 | median | q75 | max |
|---|---|---|---|---|---|---|---|---|---|
| country | factor | 0 | 142 | Afghanistan | NA | NA | NA | NA | NA |
| continent | factor | 0 | 5 | Asia | NA | NA | NA | NA | NA |
| year | integer | 0 | NA | NA | 1952.0000 | 1965.750 | 1979.5000 | 1.993250e+03 | 2.007000e+03 |
| lifeExp | numeric | 0 | NA | NA | 23.5990 | 48.198 | 60.7125 | 7.084550e+01 | 8.260300e+01 |
| pop | integer | 0 | NA | NA | 60011.0000 | 2793664.000 | 7023595.5000 | 1.958522e+07 | 1.318683e+09 |
| gdpPercap | numeric | 0 | NA | NA | 241.1659 | 1202.060 | 3531.8470 | 9.325462e+03 | 1.135231e+05 |
na = or recode sets what counts as missing
#When importing
dataframe <- read_csv("datasource.csv",
na = c("", "NA", "UNKNOWN", 999))
#When already read-in
dateframe <- dataframe %>%
mutate(char_col = recode(char_col, "NA" = NA_character_),
num_col = recode(num_col, 999 = NA_integer_))any(is.na()) checks whether there are ANY missing values
#Overall
any(is.na(gapminder))## [1] FALSE
#In a specific column
any(is.na(gapminder$lifeExp))## [1] FALSE
summarize(sum(is.na())) counts the missing values within each column
gapminder %>%
summarize(observations = n(),
missing_lifeExp = sum(is.na(lifeExp)))| observations | missing_lifeExp |
|---|---|
| 1704 | 0 |
A list of expected values can also check completeness
expected <- c("Africa" = 54,
"Americas" = 35,
"Asia" = 47,
"Europe" = 43,
"Oceania" = 14)
gapminder %>%
filter(year == 2007) %>%
group_by(continent) %>%
summarize(countries_represented = n()) %>%
mutate(countries_expected = expected[continent],
missing_countries = countries_expected - countries_represented)| continent | countries_represented | countries_expected | missing_countries |
|---|---|---|---|
| Africa | 52 | 54 | 2 |
| Americas | 25 | 35 | 10 |
| Asia | 33 | 47 | 14 |
| Europe | 30 | 43 | 13 |
| Oceania | 2 | 14 | 12 |
table() creates a cross tab count
gap_07 <- gapminder %>%
filter(year==2007) %>%
mutate(country_size = case_when(pop > 3.121e+07 ~ "large",
pop < 4.508e+06 ~ "small",
TRUE ~ "average"),
country_size = factor(country_size,
ordered = TRUE,
levels = c("large", "average", "small")))
table(gap_07$continent, gap_07$country_size)##
## large average small
## Africa 10 25 17
## Americas 6 13 6
## Asia 13 14 6
## Europe 7 17 6
## Oceania 0 1 1
prop.table() creates a frequency table, but I prefer this custom function for formatting
gap_07 <- gapminder %>%
filter(year==2007) %>%
mutate(country_size = case_when(pop > 3.121e+07 ~ "large",
pop < 4.508e+06 ~ "small",
TRUE ~ "average"),
country_size = factor(country_size,
ordered = TRUE,
levels = c("large", "average", "small")))
size_table <- table(gap_07$continent, gap_07$country_size)
props <- function(col1, col2, prop_type){
cross_tab <- table(col1, col2)
if(prop_type == "overall"){
prop.table(cross_tab) %>%
as.data.frame() %>%
as_tibble() %>%
spread(col2, Freq) %>%
janitor::adorn_totals("col") %>%
janitor::adorn_totals("row") %>%
as_tibble() %>%
mutate_at(vars(-col1), scales::percent, accuracy = 0.1) %>%
rename(` ` = col1)
} else if(prop_type == "row"){
prop.table(cross_tab, 2) %>%
as.data.frame() %>%
as_tibble() %>%
spread(col2, Freq) %>%
janitor::adorn_totals("row") %>%
as_tibble() %>%
mutate_at(vars(-col1), scales::percent, accuracy = 0.1) %>%
rename(` ` = col1)
} else{
prop.table(cross_tab, 1) %>%
as.data.frame() %>%
as_tibble() %>%
spread(col2, Freq) %>%
janitor::adorn_totals("col") %>%
as_tibble() %>%
mutate_at(vars(-col1), scales::percent, accuracy = 0.1) %>%
rename(` ` = col1)
}
}#Overall Proportions
props(gap_07$continent, gap_07$country_size, "overall")| large | average | small | Total | |
|---|---|---|---|---|
| Africa | 7.0% | 17.6% | 12.0% | 36.6% |
| Americas | 4.2% | 9.2% | 4.2% | 17.6% |
| Asia | 9.2% | 9.9% | 4.2% | 23.2% |
| Europe | 4.9% | 12.0% | 4.2% | 21.1% |
| Oceania | 0.0% | 0.7% | 0.7% | 1.4% |
| Total | 25.4% | 49.3% | 25.4% | 100.0% |
#Column Proportions
props(gap_07$continent, gap_07$country_size, "col")| large | average | small | Total | |
|---|---|---|---|---|
| Africa | 19.2% | 48.1% | 32.7% | 100.0% |
| Americas | 24.0% | 52.0% | 24.0% | 100.0% |
| Asia | 39.4% | 42.4% | 18.2% | 100.0% |
| Europe | 23.3% | 56.7% | 20.0% | 100.0% |
| Oceania | 0.0% | 50.0% | 50.0% | 100.0% |
#Row Proportions
props(gap_07$continent, gap_07$country_size, "row")| large | average | small | |
|---|---|---|---|
| Africa | 27.8% | 35.7% | 47.2% |
| Americas | 16.7% | 18.6% | 16.7% |
| Asia | 36.1% | 20.0% | 16.7% |
| Europe | 19.4% | 24.3% | 16.7% |
| Oceania | 0.0% | 1.4% | 2.8% |
| Total | 100.0% | 100.0% | 100.0% |
cor() calculated the correlation between two variables, but this function will compare each variable to all other variables
library(reshape2)
get_pretty_cormap <- function (.data, .numcols) {
cormat <- round(cor(select(.data, .numcols)), 2)
reorder_cormat <- function(cormat){
dd <- as.dist((1-cormat)/2)
hc <- hclust(dd)
cormat <-cormat[hc$order, hc$order]}
cormat <- reorder_cormat(cormat)
get_upper_tri <- function(cormat){
cormat[lower.tri(cormat)]<- NA
return(cormat)}
upper_tri <- get_upper_tri(cormat)
melted_cormat <- melt(upper_tri, na.rm = TRUE)
ggplot(melted_cormat, aes(Var2, Var1, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1,1),
space = "Lab", name = "Pearson\nCorrelation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1)) +
coord_fixed() +
geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) +
theme(axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank(),
legend.justification = c(1, 0),
legend.position = c(0.6, 0.7),
legend.direction = "horizontal") +
guides(fill = guide_colorbar(barwidth = 7, barheight = 1, title.position = "top", title.hjust = 0.5))}get_pretty_cormap(gapminder, c("year", "lifeExp", "pop", "gdpPercap"))
dataframe %>%
gather(-c(cols to keep),
key = key name,
value = value name,
factor_key = TRUE)
ratings %>%
gather(-Season,
key = Episode,
value = Rating,
factor_key = TRUE)dataframe %>%
spread(key = col to become new cols,
value = col to become values under new cols,
fill = value to fill for missing)
roster %>%
spread(key = Variable,
value = Rating)filter(!is.na()) removes entire rows with missing values in the specified column
gapminder %>%
filter(!is.na(lifeExp)) replace_na() replaces missing values
gapminder %>%
mutate(pop = replace_na(pop, 0))distinct() keeps unique rows and removes all others that are completely identical
data %>%
distinct()distinct(col, .keep_all = TRUE) (i.e., adding a column as an argument) just looks at the values within that column regardless of whether the other columns are the same (this will keep the first occurrence, so arrange as desired first)
#Most recent entry for each country
gapminder %>%
arrange(desc(year)) %>%
distinct(country, .keep_all = TRUE) %>%
head()| country | continent | year | lifeExp | pop | gdpPercap |
|---|---|---|---|---|---|
| Afghanistan | Asia | 2007 | 43.828 | 31889923 | 974.5803 |
| Albania | Europe | 2007 | 76.423 | 3600523 | 5937.0295 |
| Algeria | Africa | 2007 | 72.301 | 33333216 | 6223.3675 |
| Angola | Africa | 2007 | 42.731 | 12420476 | 4797.2313 |
| Argentina | Americas | 2007 | 75.320 | 40301927 | 12779.3796 |
| Australia | Oceania | 2007 | 81.235 | 20434176 | 34435.3674 |
Causes:
#Visual Check
boxplot(gap_07$pop)All of the circles are technically outliers. We know that the two extremes (China and India) are, in fact, correct. If there was a third that was that extreme, it would be an error that should be removed or replaced. Similarly, if there was a negative population, that would also be an error that should be removed or replaced.
#3SD Check
upper_out <- mean(gap_07$pop) + (3 * sd(gap_07$pop))
lower_out <- mean(gap_07$pop) - (3 * sd(gap_07$pop))
gap_07 %>%
filter(pop > upper_out | pop < lower_out)| country | continent | year | lifeExp | pop | gdpPercap | country_size |
|---|---|---|---|---|---|---|
| China | Asia | 2007 | 72.961 | 1318683096 | 4959.115 | large |
| India | Asia | 2007 | 64.698 | 1110396331 | 2452.210 | large |
filter() selects all observations (rows) with values matching the specification(s)
| Code | Meaning |
|---|---|
== |
Equal to |
!= |
Not equal to |
> |
Greater than |
>= |
Greater than or equal to |
< |
Less than |
<= |
Less than or equal to |
%in% c() |
Within the character vector |
!(. %in% c()) |
Not within the character vector |
is.na() |
Is NA |
!is.na() |
Is not NA |
between(x, y) |
Is between x and y |
, or & |
And |
| |
Or |
gapminder %>%
filter(between(year, 2000, 2020),
!(continent %in% c("Africa", "Americas", "Asia")),
!is.na(gdpPercap)) %>%
filter(lifeExp > 81 |
gdpPercap > 40000) %>%
head()| country | continent | year | lifeExp | pop | gdpPercap |
|---|---|---|---|---|---|
| Australia | Oceania | 2007 | 81.235 | 20434176 | 34435.37 |
| Iceland | Europe | 2007 | 81.757 | 301931 | 36180.79 |
| Ireland | Europe | 2007 | 78.885 | 4109086 | 40676.00 |
| Norway | Europe | 2002 | 79.050 | 4535591 | 44683.98 |
| Norway | Europe | 2007 | 80.196 | 4627926 | 49357.19 |
| Switzerland | Europe | 2007 | 81.701 | 7554661 | 37506.42 |
select() selects all variables (columns) called for
starts_with("abc") selects columns that begin with “abc”ends_with("xyz") selects columns that end with “xyz”contains("ijk") selects columns that contain “ijk”matches("(.)\\1") selects columns that match a regular expressioneverything() selects everything else (useful for reorganizing columns)#Just
gapminder %>%
select(country, year, pop) %>%
head()| country | year | pop |
|---|---|---|
| Afghanistan | 1952 | 8425333 |
| Afghanistan | 1957 | 9240934 |
| Afghanistan | 1962 | 10267083 |
| Afghanistan | 1967 | 11537966 |
| Afghanistan | 1972 | 13079460 |
| Afghanistan | 1977 | 14880372 |
#Not
gapminder %>%
select(-c(continent, pop)) %>%
head()| country | year | lifeExp | gdpPercap |
|---|---|---|---|
| Afghanistan | 1952 | 28.801 | 779.4453 |
| Afghanistan | 1957 | 30.332 | 820.8530 |
| Afghanistan | 1962 | 31.997 | 853.1007 |
| Afghanistan | 1967 | 34.020 | 836.1971 |
| Afghanistan | 1972 | 36.088 | 739.9811 |
| Afghanistan | 1977 | 38.438 | 786.1134 |
#Range from this to that
gapminder %>%
select(continent:pop) %>%
head()| continent | year | lifeExp | pop |
|---|---|---|---|
| Asia | 1952 | 28.801 | 8425333 |
| Asia | 1957 | 30.332 | 9240934 |
| Asia | 1962 | 31.997 | 10267083 |
| Asia | 1967 | 34.020 | 11537966 |
| Asia | 1972 | 36.088 | 13079460 |
| Asia | 1977 | 38.438 | 14880372 |
rename(new name = old name) renames the column
gapminder %>%
rename(population = pop) %>%
head()| country | continent | year | lifeExp | population | gdpPercap |
|---|---|---|---|---|---|
| Afghanistan | Asia | 1952 | 28.801 | 8425333 | 779.4453 |
| Afghanistan | Asia | 1957 | 30.332 | 9240934 | 820.8530 |
| Afghanistan | Asia | 1962 | 31.997 | 10267083 | 853.1007 |
| Afghanistan | Asia | 1967 | 34.020 | 11537966 | 836.1971 |
| Afghanistan | Asia | 1972 | 36.088 | 13079460 | 739.9811 |
| Afghanistan | Asia | 1977 | 38.438 | 14880372 | 786.1134 |
separate() splits each value within a column based on some separator
gapminder %>%
separate(col = lifeExp,
into = c("Years", "Months"),
sep = "[.]") %>%
mutate(Months = as.double(str_c("0.", Months)) * 12) %>%
head()| country | continent | year | Years | Months | pop | gdpPercap |
|---|---|---|---|---|---|---|
| Afghanistan | Asia | 1952 | 28 | 9.612 | 8425333 | 779.4453 |
| Afghanistan | Asia | 1957 | 30 | 3.984 | 9240934 | 820.8530 |
| Afghanistan | Asia | 1962 | 31 | 11.964 | 10267083 | 853.1007 |
| Afghanistan | Asia | 1967 | 34 | 0.240 | 11537966 | 836.1971 |
| Afghanistan | Asia | 1972 | 36 | 1.056 | 13079460 | 739.9811 |
| Afghanistan | Asia | 1977 | 38 | 5.256 | 14880372 | 786.1134 |
unite() merges the values of two columns into a new column, placing some separator between the values
gapminder %>%
unite(col = location,
country, continent,
sep = ", ") %>%
head()| location | year | lifeExp | pop | gdpPercap |
|---|---|---|---|---|
| Afghanistan, Asia | 1952 | 28.801 | 8425333 | 779.4453 |
| Afghanistan, Asia | 1957 | 30.332 | 9240934 | 820.8530 |
| Afghanistan, Asia | 1962 | 31.997 | 10267083 | 853.1007 |
| Afghanistan, Asia | 1967 | 34.020 | 11537966 | 836.1971 |
| Afghanistan, Asia | 1972 | 36.088 | 13079460 | 739.9811 |
| Afghanistan, Asia | 1977 | 38.438 | 14880372 | 786.1134 |
mutate() adds columns to the existing dataframe
gapminder %>%
mutate(gdp = gdpPercap * as.double(pop)) %>%
head()| country | continent | year | lifeExp | pop | gdpPercap | gdp |
|---|---|---|---|---|---|---|
| Afghanistan | Asia | 1952 | 28.801 | 8425333 | 779.4453 | 6567086330 |
| Afghanistan | Asia | 1957 | 30.332 | 9240934 | 820.8530 | 7585448670 |
| Afghanistan | Asia | 1962 | 31.997 | 10267083 | 853.1007 | 8758855797 |
| Afghanistan | Asia | 1967 | 34.020 | 11537966 | 836.1971 | 9648014150 |
| Afghanistan | Asia | 1972 | 36.088 | 13079460 | 739.9811 | 9678553274 |
| Afghanistan | Asia | 1977 | 38.438 | 14880372 | 786.1134 | 11697659231 |
transmute() adds columns and then only selects the specified columns
gapminder %>%
transmute(country,
year,
gdp = gdpPercap * as.double(pop)) %>%
head()| country | year | gdp |
|---|---|---|
| Afghanistan | 1952 | 6567086330 |
| Afghanistan | 1957 | 7585448670 |
| Afghanistan | 1962 | 8758855797 |
| Afghanistan | 1967 | 9648014150 |
| Afghanistan | 1972 | 9678553274 |
| Afghanistan | 1977 | 11697659231 |
as.character()as.double()as.integer()as.logical()gapminder %>%
mutate(pop = as.double(pop)) %>%
head()| country | continent | year | lifeExp | pop | gdpPercap |
|---|---|---|---|---|---|
| Afghanistan | Asia | 1952 | 28.801 | 8425333 | 779.4453 |
| Afghanistan | Asia | 1957 | 30.332 | 9240934 | 820.8530 |
| Afghanistan | Asia | 1962 | 31.997 | 10267083 | 853.1007 |
| Afghanistan | Asia | 1967 | 34.020 | 11537966 | 836.1971 |
| Afghanistan | Asia | 1972 | 36.088 | 13079460 | 739.9811 |
| Afghanistan | Asia | 1977 | 38.438 | 14880372 | 786.1134 |
factor() recategorizes strings as categorical data; ordered = TRUE orders the categories by the levels = c() argument (otherwise it will be alphabetical)
gapminder %>%
mutate(country = factor(country),
continent = factor(continent,
ordered = TRUE,
levels = c("Asia", "Americas", "Europe",
"Africa", "Oceania"))) %>%
head()| country | continent | year | lifeExp | pop | gdpPercap |
|---|---|---|---|---|---|
| Afghanistan | Asia | 1952 | 28.801 | 8425333 | 779.4453 |
| Afghanistan | Asia | 1957 | 30.332 | 9240934 | 820.8530 |
| Afghanistan | Asia | 1962 | 31.997 | 10267083 | 853.1007 |
| Afghanistan | Asia | 1967 | 34.020 | 11537966 | 836.1971 |
| Afghanistan | Asia | 1972 | 36.088 | 13079460 | 739.9811 |
| Afghanistan | Asia | 1977 | 38.438 | 14880372 | 786.1134 |
case_when() creates a new field of categorical data (that still needs to be factored)
gapminder %>%
filter(year == 2007) %>%
mutate(country_size = case_when(pop > 3.121e+07 ~ "Large",
pop < 4.508e+06 ~ "Small",
TRUE ~ "Average"),
country_size = factor(country_size,
ordered = TRUE,
levels = c("Small", "Average", "Large"))) %>%
head()| country | continent | year | lifeExp | pop | gdpPercap | country_size |
|---|---|---|---|---|---|---|
| Afghanistan | Asia | 2007 | 43.828 | 31889923 | 974.5803 | Large |
| Albania | Europe | 2007 | 76.423 | 3600523 | 5937.0295 | Small |
| Algeria | Africa | 2007 | 72.301 | 33333216 | 6223.3675 | Large |
| Angola | Africa | 2007 | 42.731 | 12420476 | 4797.2313 | Average |
| Argentina | Americas | 2007 | 75.320 | 40301927 | 12779.3796 | Large |
| Australia | Oceania | 2007 | 81.235 | 20434176 | 34435.3674 | Average |
recode(old value = new value) replaces all occurrences of the old value with the new value
gapminder %>%
mutate(continent = recode(continent,
"Asia" = "Asia & Pacific Islands",
"Oceania" = "Australia & New Zealand")) %>%
head()| country | continent | year | lifeExp | pop | gdpPercap |
|---|---|---|---|---|---|
| Afghanistan | Asia & Pacific Islands | 1952 | 28.801 | 8425333 | 779.4453 |
| Afghanistan | Asia & Pacific Islands | 1957 | 30.332 | 9240934 | 820.8530 |
| Afghanistan | Asia & Pacific Islands | 1962 | 31.997 | 10267083 | 853.1007 |
| Afghanistan | Asia & Pacific Islands | 1967 | 34.020 | 11537966 | 836.1971 |
| Afghanistan | Asia & Pacific Islands | 1972 | 36.088 | 13079460 | 739.9811 |
| Afghanistan | Asia & Pacific Islands | 1977 | 38.438 | 14880372 | 786.1134 |
A lookup table recodes all values faster:
library(hflights)
hflights %>%
count(UniqueCarrier) %>%
arrange(desc(n)) %>%
head()| UniqueCarrier | n |
|---|---|
| XE | 73053 |
| CO | 70032 |
| WN | 45343 |
| OO | 16061 |
| MQ | 4648 |
| US | 4082 |
airline_lookup_table <- c("AA" = "American",
"AS" = "Alaska",
"B6" = "JetBlue",
"CO" = "Continental",
"DL" = "Delta",
"OO" = "SkyWest",
"UA" = "United",
"US" = "US_Airways",
"WN" = "Southwest",
"EV" = "Atlantic_Southeast",
"F9" = "Frontier",
"FL" = "AirTran",
"MQ" = "American_Eagle",
"XE" = "ExpressJet",
"YV" = "Mesa")
hflights %>%
mutate(UniqueCarrier = airline_lookup_table[UniqueCarrier]) %>%
count(UniqueCarrier) %>%
arrange(desc(n)) %>%
head()| UniqueCarrier | n |
|---|---|
| ExpressJet | 73053 |
| Continental | 70032 |
| Southwest | 45343 |
| SkyWest | 16061 |
| American_Eagle | 4648 |
| US_Airways | 4082 |
First order with arrange() and arrange(desc()), then lead() and lag() pull the leading or lagging value
gapminder %>%
filter(country == "United States",
year > 1990) %>%
arrange(year) %>%
transmute(year,
gdpPercap,
prior_gdpPercap = lag(gdpPercap),
next_gdpPercap = lead(gdpPercap))| year | gdpPercap | prior_gdpPercap | next_gdpPercap |
|---|---|---|---|
| 1992 | 32003.93 | NA | 35767.43 |
| 1997 | 35767.43 | 32003.93 | 39097.10 |
| 2002 | 39097.10 | 35767.43 | 42951.65 |
| 2007 | 42951.65 | 39097.10 | NA |
First order with arrange() and arrange(desc()), then cum_() does a rolling analysis
gapminder %>%
filter(country == "United States",
year > 1990) %>%
arrange(year) %>%
transmute(year,
gdpPercap,
rolling_sum = cumsum(gdpPercap),
rolling_mean = cummean(gdpPercap),
min_to_date = cummin(gdpPercap),
max_to_date = cummax(gdpPercap))| year | gdpPercap | rolling_sum | rolling_mean | min_to_date | max_to_date |
|---|---|---|---|---|---|
| 1992 | 32003.93 | 32003.93 | 32003.93 | 32003.93 | 32003.93 |
| 1997 | 35767.43 | 67771.37 | 33885.68 | 32003.93 | 35767.43 |
| 2002 | 39097.10 | 106868.46 | 35622.82 | 32003.93 | 39097.10 |
| 2007 | 42951.65 | 149820.12 | 37455.03 | 32003.93 | 42951.65 |
Calling a column by name analyzes the whole column
gapminder %>%
filter(year == 2007) %>%
transmute(country,
continent,
lifeExp,
world_lifeExp = mean(lifeExp, na.rm = TRUE),
above_world_lifeExp = lifeExp > world_lifeExp,
pop) %>%
arrange(desc(pop)) %>%
head()| country | continent | lifeExp | world_lifeExp | above_world_lifeExp | pop |
|---|---|---|---|---|---|
| China | Asia | 72.961 | 67.00742 | TRUE | 1318683096 |
| India | Asia | 64.698 | 67.00742 | FALSE | 1110396331 |
| United States | Americas | 78.242 | 67.00742 | TRUE | 301139947 |
| Indonesia | Asia | 70.650 | 67.00742 | TRUE | 223547000 |
| Brazil | Americas | 72.390 | 67.00742 | TRUE | 190010647 |
| Pakistan | Asia | 65.483 | 67.00742 | FALSE | 169270617 |
ntile() categorizes the data into n even groupsrank() ranks the columngapminder %>%
filter(year == 2007) %>%
transmute(country,
continent,
lifeExp,
lifeExpQuantile = ntile(desc(lifeExp), 4),
lifeExpRank = rank(desc(lifeExp)),
pop) %>%
arrange(desc(pop)) %>%
head()| country | continent | lifeExp | lifeExpQuantile | lifeExpRank | pop |
|---|---|---|---|---|---|
| China | Asia | 72.961 | 2 | 59 | 1318683096 |
| India | Asia | 64.698 | 3 | 91 | 1110396331 |
| United States | Americas | 78.242 | 1 | 30 | 301139947 |
| Indonesia | Asia | 70.650 | 3 | 80 | 223547000 |
| Brazil | Americas | 72.390 | 2 | 68 | 190010647 |
| Pakistan | Asia | 65.483 | 3 | 89 | 169270617 |
The janitor package can adorn_totals()
#Row total
gapminder %>%
filter(continent == "Oceania",
year > 1990) %>%
select(country, year, pop) %>%
spread(year, pop) %>%
janitor::adorn_totals("col") %>%
as_tibble()| country | 1992 | 1997 | 2002 | 2007 | Total |
|---|---|---|---|---|---|
| Australia | 17481977 | 18565243 | 19546792 | 20434176 | 76028188 |
| New Zealand | 3437674 | 3676187 | 3908037 | 4115771 | 15137669 |
#Column total
gapminder %>%
filter(continent == "Oceania",
year > 1990) %>%
select(country, year, pop) %>%
spread(year, pop) %>%
janitor::adorn_totals("row") %>%
as_tibble()| country | 1992 | 1997 | 2002 | 2007 |
|---|---|---|---|---|
| Australia | 17481977 | 18565243 | 19546792 | 20434176 |
| New Zealand | 3437674 | 3676187 | 3908037 | 4115771 |
| Total | 20919651 | 22241430 | 23454829 | 24549947 |
#Both totals
gapminder %>%
filter(continent == "Oceania",
year > 1990) %>%
select(country, year, pop) %>%
spread(year, pop) %>%
janitor::adorn_totals("row") %>%
janitor::adorn_totals("col") %>%
as_tibble()| country | 1992 | 1997 | 2002 | 2007 | Total |
|---|---|---|---|---|---|
| Australia | 17481977 | 18565243 | 19546792 | 20434176 | 76028188 |
| New Zealand | 3437674 | 3676187 | 3908037 | 4115771 | 15137669 |
| Total | 20919651 | 22241430 | 23454829 | 24549947 | 91165857 |
group_by can perform any analysis within a group instead of over the whole column
gapminder %>%
filter(year == 2007) %>%
transmute(country,
continent,
lifeExp,
overall_quartile = ntile(desc(lifeExp), 4)) %>%
group_by(continent) %>%
mutate(continent_quantile = ntile(desc(lifeExp), 4)) %>%
ungroup() %>%
arrange(continent) %>%
head()| country | continent | lifeExp | overall_quartile | continent_quantile |
|---|---|---|---|---|
| Algeria | Africa | 72.301 | 2 | 1 |
| Angola | Africa | 42.731 | 4 | 4 |
| Benin | Africa | 56.728 | 4 | 2 |
| Botswana | Africa | 50.728 | 4 | 3 |
| Burkina Faso | Africa | 52.295 | 4 | 3 |
| Burundi | Africa | 49.580 | 4 | 3 |
mutate_all, mutate_at, and mutate_if analyze multiple columns simultaneously with the same function
#All
gapminder %>%
mutate_all(as.character) %>%
head()| country | continent | year | lifeExp | pop | gdpPercap |
|---|---|---|---|---|---|
| Afghanistan | Asia | 1952 | 28.801 | 8425333 | 779.4453145 |
| Afghanistan | Asia | 1957 | 30.332 | 9240934 | 820.8530296 |
| Afghanistan | Asia | 1962 | 31.997 | 10267083 | 853.10071 |
| Afghanistan | Asia | 1967 | 34.02 | 11537966 | 836.1971382 |
| Afghanistan | Asia | 1972 | 36.088 | 13079460 | 739.9811058 |
| Afghanistan | Asia | 1977 | 38.438 | 14880372 | 786.11336 |
#Just
gapminder %>%
mutate_at(vars(lifeExp, gdpPercap), round, 1) %>%
head()| country | continent | year | lifeExp | pop | gdpPercap |
|---|---|---|---|---|---|
| Afghanistan | Asia | 1952 | 28.8 | 8425333 | 779.4 |
| Afghanistan | Asia | 1957 | 30.3 | 9240934 | 820.9 |
| Afghanistan | Asia | 1962 | 32.0 | 10267083 | 853.1 |
| Afghanistan | Asia | 1967 | 34.0 | 11537966 | 836.2 |
| Afghanistan | Asia | 1972 | 36.1 | 13079460 | 740.0 |
| Afghanistan | Asia | 1977 | 38.4 | 14880372 | 786.1 |
#All but
gapminder %>%
mutate_at(vars(-country, -continent), as.double) %>%
head()| country | continent | year | lifeExp | pop | gdpPercap |
|---|---|---|---|---|---|
| Afghanistan | Asia | 1952 | 28.801 | 8425333 | 779.4453 |
| Afghanistan | Asia | 1957 | 30.332 | 9240934 | 820.8530 |
| Afghanistan | Asia | 1962 | 31.997 | 10267083 | 853.1007 |
| Afghanistan | Asia | 1967 | 34.020 | 11537966 | 836.1971 |
| Afghanistan | Asia | 1972 | 36.088 | 13079460 | 739.9811 |
| Afghanistan | Asia | 1977 | 38.438 | 14880372 | 786.1134 |
#Like
gapminder %>%
mutate_at(vars(matches("gdp")), round, 1) %>%
head()| country | continent | year | lifeExp | pop | gdpPercap |
|---|---|---|---|---|---|
| Afghanistan | Asia | 1952 | 28.801 | 8425333 | 779.4 |
| Afghanistan | Asia | 1957 | 30.332 | 9240934 | 820.9 |
| Afghanistan | Asia | 1962 | 31.997 | 10267083 | 853.1 |
| Afghanistan | Asia | 1967 | 34.020 | 11537966 | 836.2 |
| Afghanistan | Asia | 1972 | 36.088 | 13079460 | 740.0 |
| Afghanistan | Asia | 1977 | 38.438 | 14880372 | 786.1 |
#If
gapminder %>%
mutate_if(is.numeric, scales::comma, accuracy = 0.1) %>%
head()| country | continent | year | lifeExp | pop | gdpPercap |
|---|---|---|---|---|---|
| Afghanistan | Asia | 1,952.0 | 28.8 | 8,425,333.0 | 779.4 |
| Afghanistan | Asia | 1,957.0 | 30.3 | 9,240,934.0 | 820.9 |
| Afghanistan | Asia | 1,962.0 | 32.0 | 10,267,083.0 | 853.1 |
| Afghanistan | Asia | 1,967.0 | 34.0 | 11,537,966.0 | 836.2 |
| Afghanistan | Asia | 1,972.0 | 36.1 | 13,079,460.0 | 740.0 |
| Afghanistan | Asia | 1,977.0 | 38.4 | 14,880,372.0 | 786.1 |
summarize() groups and returns a single summary statistic
gapminder %>%
filter(year == 2007) %>%
group_by(continent) %>%
summarize(count = n(),
missing = sum(is.na(lifeExp)),
unique_countries = n_distinct(country),
low_lifeExp = min(lifeExp, na.rm = TRUE),
high_lifeExp = max(lifeExp, na.rm = TRUE),
mean_lifeExp = mean(lifeExp, na.rm = TRUE),
median_lifeEx = median(lifeExp, na.rm = TRUE),
above_world_mean = sum(lifeExp > mean(gapminder %>%
filter(year == 2007) %>%
.$lifeExp,
na.rm = TRUE)),
share_above_world_mean = above_world_mean / count) %>%
ungroup()| continent | count | missing | unique_countries | low_lifeExp | high_lifeExp | mean_lifeExp | median_lifeEx | above_world_mean | share_above_world_mean |
|---|---|---|---|---|---|---|---|---|---|
| Africa | 52 | 0 | 52 | 39.613 | 76.442 | 54.80604 | 52.9265 | 7 | 0.1346154 |
| Americas | 25 | 0 | 25 | 60.916 | 80.653 | 73.60812 | 72.8990 | 23 | 0.9200000 |
| Asia | 33 | 0 | 33 | 43.828 | 82.603 | 70.72848 | 72.3960 | 23 | 0.6969697 |
| Europe | 30 | 0 | 30 | 71.777 | 81.757 | 77.64860 | 78.6085 | 30 | 1.0000000 |
| Oceania | 2 | 0 | 2 | 80.204 | 81.235 | 80.71950 | 80.7195 | 2 | 1.0000000 |
str_trim() cuts out spaces at the beginning and end of the string
str_trim(" this is a test ")## [1] "this is a test"
str_pad() adds a number (width) of characters (pad) at the beginning (left) or end (right) of a string
county_codes <- c("1", "51", "201")
str_pad(county_codes,
width = 3,
side = "left",
pad = "0")## [1] "001" "051" "201"
parse_number() drops all non-numeric characters
parse_number("$1,234,567")## [1] 1234567
str_replace(string, old, new) swaps out old for new characters
#Replace First
str_replace("bananas","a","o")## [1] "bonanas"
#Replace All
str_replace_all("bananas","a","o")## [1] "bononos"
str_remove(string, old) just drops the old characters
#Remove First
str_remove("bananas","a")## [1] "bnanas"
#Remove All
str_remove_all("bananas","a")## [1] "bnns"
str has some capitalization conventions (though setting all to lower is best for matching)
text <- "nEw YoRk CiTy"
#All Lower
str_to_lower(text)## [1] "new york city"
#All Upper
str_to_upper(text)## [1] "NEW YORK CITY"
#First Word Capitalized
str_to_sentence(text)## [1] "New york city"
#All Words Capitalized
str_to_title(text)## [1] "New York City"
library(lubridate)A date-time is a point on the timeline stored as the number of sections since 1970-01-01 00:00:00 UTC
as_datetime(1511870400) ## [1] "2017-11-28 12:00:00 UTC"
A date is a point on the timeline stored as the number of days since 1970-01-01
as_date(17498)## [1] "2017-11-28"
A time is a point on the timeline stored as the number of seconds since 00:00:00
hms::as_hms(10230)## 02:50:30
now() returns the date-time
now()## [1] "2022-02-10 17:46:04 EST"
today() returns the date
today()## [1] "2022-02-10"
The functions match the order of the components. For example, 08/07/06 could be:
mdy("08/07/06")## [1] "2006-08-07"
dmy("08/07/06")## [1] "2006-07-08"
ymd("08/07/06")## [1] "2008-07-06"
ydm("08/07/06")## [1] "2008-06-07"
It also works for more complicated renderings like “The 4th of July, 2018”
dmy("The 4th of July, 2018")## [1] "2018-07-04"
And for time
mdy_hms("08/07/06 10:09:30")## [1] "2006-08-07 10:09:30 UTC"
For more complex strings, the format can be specified:
y = year (2021 or 21, but defaults to current century)
m = numerical month (1-12 or 01-12)
b = month name (January or Jan)
d = day of the month (1-31 or 01-31)
w = numerical day of the week (0-6 with Sunday as 0)
a = day of the week name (Monday or Mon)
_ = split between date and time
H = 24 hour (0-24 or 00-24)
I = am/pm hour (1-12 or 01-12)
p = am or pm when using I
M = minute (0-59 or 00-59)
s = second (0-61 or 00-61)
parse_date_time("Monday June 1st 2010 at 4pm", orders = "amdy_Ip")## [1] "2010-06-01 16:00:00 UTC"
tz() checks the timezone
tz(ymd_hms("2017-03-11 12:00:00"))## [1] "UTC"
tz = sets the timezone
ymd_hms("2017-03-11 12:00:00", tz = "America/Los_Angeles")## [1] "2017-03-11 12:00:00 PST"
force_tz() changes an incorrect timezone (without changing the time value)
force_tz(ymd_hms("2017-03-11 12:00:00"), tzone = "America/New_York")## [1] "2017-03-11 12:00:00 EST"
with_tz converts to a new timezone (thus changing the time value)
with_tz(ymd_hms("2017-03-11 12:00:00"), tzone = "America/New_York")## [1] "2017-03-11 07:00:00 EST"
date() = drops the time
year() = year
month() = month
day() = number day of the month
yday() = number day of the year (1-365)
wday() = day of the week
hour() = hour
min() = minute
leap_year() = true/false for leap year
dst() = true/false for daylight savings
quarter() = quarter of year
semester() = half of year
label = TRUE for word instead of number
abbr = TRUE for abbreviated word
str_c("Today is day ", day(today()),
" of the month of ", month(today(), label = TRUE, abbr = FALSE),
", which is a ", wday(today(), label = TRUE, abbr = FALSE),
". It is also day ", yday(today()),
" of ", year(today()),
".") ## [1] "Today is day 10 of the month of February, which is a Thursday. It is also day 41 of 2022."
round_date() gives the nearest unit
round_date(ymd("2021-06-30"), unit = "month")## [1] "2021-07-01"
ceiling_date() rounds up
ceiling_date(ymd_hm("2021-06-30 12:50"), unit = "15 minutes")## [1] "2021-06-30 13:00:00 UTC"
floor_date() rounds down
floor_date(ymd_hm("2021-06-30 12:50"), unit = "hour")## [1] "2021-06-30 12:00:00 UTC"
Time does not behave like general number lines. To evaluate, you must ask which is more important: the datetime in the real world or the length of time (duration)?
seconds() / dseconds()minutes() / dminutes()hours() / dhours()days() / ddays()weeks() / dweeks()months() / dmonths()years() / dyears()#Normal Day
normal_day <- ymd_hm("2018-01-01 24:00", tz = "America/New_York")
period <- normal_day + hours(3)
duration <- normal_day + dhours(3)
tibble(Difference = difftime(period, duration, units = "hours"),
Period = period,
Duration = duration)| Difference | Period | Duration |
|---|---|---|
| 0 hours | 2018-01-02 03:00:00 | 2018-01-02 03:00:00 |
#Daylight Savings
daylight_savings_start <- ymd_hm("2018-03-10 24:00", tz = "America/New_York")
period <- daylight_savings_start + hours(3)
duration <- daylight_savings_start + dhours(3)
tibble(Difference = difftime(period, duration, units = "hours"),
Period = period,
Duration = duration)| Difference | Period | Duration |
|---|---|---|
| -1 hours | 2018-03-11 03:00:00 | 2018-03-11 04:00:00 |
#Exit Daylight Savings
daylight_savings_end <- ymd_hm("2018-11-03 24:00", tz = "America/New_York")
period <- daylight_savings_end + hours(3)
duration <- daylight_savings_end + dhours(3)
tibble(Difference = difftime(period, duration, units = "hours"),
Period = period,
Duration = duration)| Difference | Period | Duration |
|---|---|---|
| 1 hours | 2018-11-04 03:00:00 | 2018-11-04 02:00:00 |
#Normal Year
normal_year <- ymd("2018-09-20", tz = "America/New_York")
period <- normal_year + years(1)
duration <- normal_year + years(1)
tibble(Difference = difftime(period, duration, units = "days"),
Period = period,
Duration = duration)| Difference | Period | Duration |
|---|---|---|
| 0 days | 2019-09-20 | 2019-09-20 |
#Leap Year
leap_year <- ymd("2019-09-20", tz = "America/New_York")
period <- leap_year + years(1)
duration <- leap_year + dyears(1)
tibble(Difference = difftime(period, duration, units = "days"),
Period = period,
Duration = duration)| Difference | Period | Duration |
|---|---|---|
| 0.75 days | 2020-09-20 | 2020-09-19 06:00:00 |
For months, imaginary dates (e.g. Feb 31) need to be dealt with:
ymd("2019-1-31") + months(1)## [1] NA
ymd("2019-1-31") %>% add_with_rollback(months(1))## [1] "2019-02-28"
ymd("2019-1-31") %>% add_with_rollback(months(1), roll_to_first = TRUE)## [1] "2019-03-01"
ymd("2019-1-31") + dmonths(1)## [1] "2019-03-02 10:30:00 UTC"
difftime() measures the duration between two real datetimes
secsminshoursdaysweeks#Normal Day
difftime(ymd_hm("2018-01-02 03:00", tz = "America/New_York"),
ymd_hm("2018-01-01 24:00", tz = "America/New_York"),
unit = "days")## Time difference of 0.125 days
#Enter Daylight Savings
difftime(ymd_hm("2018-03-11 03:00", tz = "America/New_York"),
ymd_hm("2018-03-10 24:00", tz = "America/New_York"),
unit = "days")## Time difference of 0.08333333 days
#Exit Daylight Savings
difftime(ymd_hm("2018-11-04 03:00", tz = "America/New_York"),
ymd_hm("2018-11-03 24:00", tz = "America/New_York"),
unit = "days")## Time difference of 0.1666667 days
interval() creates an interval
ryan <- interval(ymd("1990-11-04"), today())
ryan## [1] 1990-11-04 UTC--2022-02-10 UTC
int_length() measures the length
int_length(ryan) %>% dseconds()## [1] "986774400s (~31.27 years)"
int_shift() modifies the interval
int_shift(ryan, days(5))## [1] 1990-11-09 UTC--2022-02-15 UTC
%within% determines if a date is within the interval
the_90s <- interval(ymd("1990-01-01"), ymd("1999-12-31"))
today() %within% the_90s## [1] FALSE
int_overlaps() determines if two intervals overlap
the_1900s <- interval(ymd("1900-01-01"), ymd("1999-12-31"))
the_90s %within% the_1900s## [1] TRUE
the_1900s %within% the_90s## [1] FALSE
int_overlaps(the_90s, the_1900s)## [1] TRUE
inner_join()full_join()left_join()right_join()# Basic
joined_data <- data1 %>%
inner_join(data2, by = "common_col")
# Multiple keys
joined_data <- data1 %>%
inner_join(data2, by = c("common_col1", "common_col2")
# Key with different name
joined_data <- data1 %>%
inner_join(data2, by = c("col from df1" = "col from df2"))sec_council <- tibble(country = c("United States", "United Kingdom", "France",
"Russia", "China"))semi_join() returns data from one dataset that is present in a second dataset
gapminder %>%
filter(year == 2007) %>%
semi_join(sec_council, by = "country") | country | continent | year | lifeExp | pop | gdpPercap |
|---|---|---|---|---|---|
| China | Asia | 2007 | 72.961 | 1318683096 | 4959.115 |
| France | Europe | 2007 | 80.657 | 61083916 | 30470.017 |
| United Kingdom | Europe | 2007 | 79.425 | 60776238 | 33203.261 |
| United States | Americas | 2007 | 78.242 | 301139947 | 42951.653 |
anti_join() returns data from one dataset that is NOT present in a second dataset
gapminder %>%
filter(year == 2007) %>%
anti_join(sec_council, by = "country") %>%
head()| country | continent | year | lifeExp | pop | gdpPercap |
|---|---|---|---|---|---|
| Afghanistan | Asia | 2007 | 43.828 | 31889923 | 974.5803 |
| Albania | Europe | 2007 | 76.423 | 3600523 | 5937.0295 |
| Algeria | Africa | 2007 | 72.301 | 33333216 | 6223.3675 |
| Angola | Africa | 2007 | 42.731 | 12420476 | 4797.2313 |
| Argentina | Americas | 2007 | 75.320 | 40301927 | 12779.3796 |
| Australia | Oceania | 2007 | 81.235 | 20434176 | 34435.3674 |
df1 <- tibble("A" = c(1, 2, 3),
"B" = c(6, 5, 4))
df2 <- tibble("A" = c(4, 5, 6),
"B" = c(3, 2, 1),
"C" = c("L", "M", "N"))
df3 <- tibble("B" = c(4, 5, 6),
"C" = c("1", "2", "3"))bind_rows() tacks on new rows
df1 %>%
bind_rows(df2)| A | B | C |
|---|---|---|
| 1 | 6 | NA |
| 2 | 5 | NA |
| 3 | 4 | NA |
| 4 | 3 | L |
| 5 | 2 | M |
| 6 | 1 | N |
bind_cols() tacks on new columns
df1 %>%
bind_cols(df3)| A | B…2 | B…3 | C |
|---|---|---|---|
| 1 | 6 | 4 | 1 |
| 2 | 5 | 5 | 2 |
| 3 | 4 | 6 | 3 |
country_z <- gapminder %>%
filter(year == 2007,
str_detect(str_to_lower(country), "z"))
country_w <- gapminder %>%
filter(year == 2007,
str_detect(str_to_lower(country), "w"))union() returns all, removing duplicates
country_z %>%
union(country_w) %>%
head()| country | continent | year | lifeExp | pop | gdpPercap |
|---|---|---|---|---|---|
| Bosnia and Herzegovina | Europe | 2007 | 74.852 | 4552198 | 7446.2988 |
| Brazil | Americas | 2007 | 72.390 | 190010647 | 9065.8008 |
| Czech Republic | Europe | 2007 | 76.486 | 10228744 | 22833.3085 |
| Mozambique | Africa | 2007 | 42.082 | 19951656 | 823.6856 |
| New Zealand | Oceania | 2007 | 80.204 | 4115771 | 25185.0091 |
| Swaziland | Africa | 2007 | 39.613 | 1133066 | 4513.4806 |
intersect() returns only observations in both
country_z %>%
intersect(country_w) %>%
head()| country | continent | year | lifeExp | pop | gdpPercap |
|---|---|---|---|---|---|
| New Zealand | Oceania | 2007 | 80.204 | 4115771 | 25185.0091 |
| Swaziland | Africa | 2007 | 39.613 | 1133066 | 4513.4806 |
| Switzerland | Europe | 2007 | 81.701 | 7554661 | 37506.4191 |
| West Bank and Gaza | Asia | 2007 | 73.422 | 4018332 | 3025.3498 |
| Zimbabwe | Africa | 2007 | 43.487 | 12311143 | 469.7093 |
setdiff() returns only observations in the first that are not in the second
country_z %>%
setdiff(country_w) %>%
head()| country | continent | year | lifeExp | pop | gdpPercap |
|---|---|---|---|---|---|
| Bosnia and Herzegovina | Europe | 2007 | 74.852 | 4552198 | 7446.2988 |
| Brazil | Americas | 2007 | 72.390 | 190010647 | 9065.8008 |
| Czech Republic | Europe | 2007 | 76.486 | 10228744 | 22833.3085 |
| Mozambique | Africa | 2007 | 42.082 | 19951656 | 823.6856 |
| Tanzania | Africa | 2007 | 52.517 | 38139640 | 1107.4822 |
| Venezuela | Americas | 2007 | 73.747 | 26084662 | 11415.8057 |